# Environment setup: plotting, profiling, and data load for the loan-acceptance analysis.
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
sns.set(color_codes=True)
# Jupyter magic: render matplotlib figures inline in the notebook.
%matplotlib inline
import sys
# Shell escape: install pandas-profiling into this kernel's own interpreter.
!{sys.executable} -m pip install pandas-profiling[notebook]
import pandas_profiling
import warnings
warnings.filterwarnings('ignore')
Loan=pd.read_csv('Loan_Modelling.csv')#load the loan-modelling dataset into a DataFrame named Loan
Univariate and bivariate analysis
pandas_profiling.ProfileReport(Loan) #automated EDA report: per-column stats, correlations, missing values
Loan.info()#column names, dtypes, non-null counts, and memory footprint
It seems to me that ID isn't great and ZIP may be the same
# Histogram of the ID column; IDs are unique, so the bin count is arbitrary.
plt.hist(Loan['ID'], bins=50)
plt.show()
This is not meaningful AT ALL
# ID carries no signal; keep a backup copy, then drop it from the working frame.
id_loan = Loan['ID']
Loan.drop(columns=['ID'], inplace=True)
# Age histogram: one bin per year across the observed 23-67 range.
plt.hist(Loan['Age'], bins=44)
plt.show()
Looks pretty normal
# Experience histogram: one bin per year over the -3..43 span.
# Fix: the original cell never called plt.show(), so the figure was not
# rendered outside inline mode; every other plotting cell calls it.
plt.hist(Loan['Experience'], 46)
plt.show()
Looks pretty normal
# Income distribution, 20 bins.
plt.hist(Loan['Income'], bins=20)
plt.show()
Pretty bad left tail, there are people making over 100,000
# ZIP codes treated as raw numbers, 20 bins.
plt.hist(Loan['ZIPCode'], bins=20)
plt.show()
No pattern, If I knew the distance from these to a center point that may be better
# Count of customers by family size (categorical bar chart).
sns.countplot(Loan['Family'])
plt.show()
Good spread, but a good number are single and no families are above 4, weird
# Average monthly credit-card spend; ~108 unique values, so 20 bins suffice.
plt.hist(Loan['CCAvg'], bins=20)
plt.show()
A lot of people don't use their credit card, or use it for less than 1k a month. Very few spend above 8k a month. I'd assume that most of these are high-income people; let's see
# Scatter with fitted regression line: does credit-card spend track income?
sns.lmplot(data=Loan, x='Income', y='CCAvg')
plt.show()
It seems all heavy cc users are higher income (roughly 100,000-200,000) but the reverse is not true, some "rich" people are not using their CC
# Count of customers per education level.
sns.countplot(Loan['Education'])
plt.show()
Most are undergrads, but we see a high number with advanced education. I wonder if there are high-school-or-lower customers?
# Mortgage size distribution, 20 bins; zero means no mortgage.
plt.hist(Loan['Mortgage'], bins=20)
plt.show()
Almost all are zero, but this isn't bad?
# Target variable: did the customer accept the personal loan in the last campaign?
sns.countplot(Loan['Personal_Loan'])
plt.show()
90%+ did not accept our loan last time
# Securities-account ownership counts.
sns.countplot(Loan['Securities_Account'])
plt.show()
Almost no one had a security account at our bank
# Certificate-of-deposit account ownership counts.
sns.countplot(Loan['CD_Account'])
plt.show()
Even fewer have a certificate of deposit account
# Online-banking usage counts.
sns.countplot(Loan['Online'])
plt.show()
A small majority do use online banking facilities
# Counts of customers holding another bank's credit card.
sns.countplot(Loan['CreditCard'])
plt.show()
Roughly 2/3 do not use another bank's credit card. Let's start bivariate analysis
import numpy as np
def plot_corr(df, size=19):
    """Draw an annotated correlation-matrix heatmap for the numeric columns of df.

    df   : DataFrame whose pairwise correlations are plotted.
    size : side length (inches) of the square figure.
    """
    corr = df.corr()
    fig, ax = plt.subplots(figsize=(size, size))
    ax.matshow(corr)
    # One tick per column on both axes, labelled with the column names.
    ticks = range(len(corr.columns))
    plt.xticks(ticks, corr.columns)
    plt.yticks(ticks, corr.columns)
    # Overlay every cell with its correlation value to one decimal place.
    for (row, col), value in np.ndenumerate(corr):
        ax.text(col, row, '{:0.1f}'.format(value), ha='center', va='center')
plot_corr(Loan)#full correlation heatmap of the cleaned frame
Personal loan correlation Positive high: income, ccavg, and cdaccount Negative high:None Others:ccavg and income CDaccount and credit cad securities acount and cd account Age and experience are 1!
# Age vs loan acceptance, every observation drawn (swarm).
sns.swarmplot(data=Loan, x='Personal_Loan', y='Age')
plt.show()
There is not a lot of good info here, though no one below about 25 accepted
# Experience vs loan acceptance.
sns.swarmplot(data=Loan, x='Personal_Loan', y='Experience')
plt.show()
Again nothing real
# Income vs loan acceptance.
sns.swarmplot(data=Loan, x='Personal_Loan', y='Income')
plt.show()
No low-income (below 50k) customers accepted our loans, and it seems higher-income customers accept more often. Loans seem to be accepted more above about 100k
# ZIP code vs loan acceptance (ZIP treated as a raw number).
sns.swarmplot(data=Loan, x='Personal_Loan', y='ZIPCode')
plt.show()
I don't see a siginificant correlation
# Loan acceptance split by family size.
sns.countplot(Loan['Personal_Loan'], hue=Loan['Family'])
plt.show()
Single people and people with only one other person in the family are less likely to accept the loan, and those with 3 or 4 people in the family are more likely to accept the loan
# Credit-card average spend vs loan acceptance.
sns.swarmplot(data=Loan, x='Personal_Loan', y='CCAvg')
plt.show()
There does not seem to be a lot here but people with less than 2.5K are less likely to accept and anyone aboe 9k will accept
# Loan acceptance split by education level.
sns.countplot(Loan['Personal_Loan'], hue=Loan['Education'])
plt.show()
Correlation: people with above-undergrad education are more likely to accept, but undergrads are less likely
# Mortgage spread by acceptance; boxplot used because swarmplot was far too slow here.
sns.boxplot(data=Loan, x='Personal_Loan', y='Mortgage')
plt.show()
Note: swarm plot was not working well here and took about 20 minutes with no result, I switched to boxplot The IQR for people who did not accept loans was smaller, but there is a significant overlap and there is no reason to believe there is a relationship
# Loan acceptance split by securities-account ownership.
sns.countplot(Loan['Personal_Loan'], hue=Loan['Securities_Account'])
plt.show()
No clear relationship here
# Loan acceptance split by CD-account ownership.
sns.countplot(Loan['Personal_Loan'], hue=Loan['CD_Account'])
plt.show()
People with CD accounts appear slightly more likely to accept the loan
# Loan acceptance split by online-banking usage.
sns.countplot(Loan['Personal_Loan'], hue=Loan['Online'])
plt.show()
People who bank online may be slighly more likely to accept the loan
# Loan acceptance split by whether the customer holds another bank's card.
sns.countplot(Loan['Personal_Loan'], hue=Loan['CreditCard'])
plt.show()
No clear correlation, but having a card from a second bank MAY lead to accepting the loan more often
Data preprocessing
# Per-column null counts, largest first; confirms the dataset is complete.
Loan.isnull().sum().sort_values(ascending=False)
There are no missing values, move onto outlier Let's check numeric
# Skew check: mean close to median suggests Age is roughly symmetric.
Loan['Age'].mean()
Loan['Age'].median()
Age is good
# Skew check for Experience.
Loan['Experience'].mean()
Loan['Experience'].median()
Experience is good
# Skew check for Income: mean well above median indicates a right tail.
Loan['Income'].mean()
Loan['Income'].median()
Log income
# Income is skewed; log(x+1) compresses the tail (the +1 keeps zero incomes finite).
Loan['Income_log']=np.log(Loan['Income'] + 1)
# Re-check: mean and median of the logged column should now be closer.
Loan['Income_log'].mean()
Loan['Income_log'].median()
Better
# Keep a backup of raw Income, then drop it in favour of Income_log.
income_loan=Loan['Income']
Loan.drop(['Income'], axis=1, inplace=True)#backup retained in income_loan above
# ZIP central tendency (arithmetically meaningless, but confirms no odd values).
Loan['ZIPCode'].mean()
Loan['ZIPCode'].median()
This is good. I think I will remove ZIP though, as it hardly seems like we can get anything from it without knowing the city, type of location, or similar
# Back up and drop ZIPCode: raw ZIP numbers carry no usable signal on their own.
ZIP_loan=Loan['ZIPCode']
Loan.drop(['ZIPCode'], axis=1, inplace=True)#backup retained in ZIP_loan above
# Skew check for CCAvg.
Loan['CCAvg'].mean()
Loan['CCAvg'].median()
Off enough, lets log
# CCAvg is right-skewed too; apply the same log(x+1) transform and re-check.
Loan['CCAvg_log']=np.log(Loan['CCAvg'] + 1)
Loan['CCAvg_log'].mean()
Loan['CCAvg_log'].median()
Not perfect, but better
# Back up and drop raw CCAvg in favour of CCAvg_log.
cc_loan=Loan['CCAvg']
Loan.drop(['CCAvg'], axis=1, inplace=True)#backup retained in cc_loan above
# Skew check for Mortgage: most customers have none, so mean >> median.
Loan['Mortgage'].mean()
Loan['Mortgage'].median()
Way off: most people (50+%) have no mortgage, and a smaller number drive up the mean
# Try the log(x+1) transform on Mortgage and inspect the result.
Loan['Mortgage_log']=np.log(Loan['Mortgage'] + 1)
Loan['Mortgage_log'].mean()
Loan['Mortgage_log'].median()
plt.hist(Loan['Mortgage_log'],20)
plt.show()
This did not work, maybe I will try zscore
# The log did not help (the spike at zero survives any monotone transform);
# try standardising to z-scores instead.
from sklearn.preprocessing import StandardScaler
std_scaler = StandardScaler()
Loan['Mortgage_z']=std_scaler.fit_transform(Loan[['Mortgage']])
Loan['Mortgage_z'].mean()
Loan['Mortgage_z'].median()
plt.hist(Loan['Mortgage_z'],20)
plt.show()
Maybe a little better, but not great, I will keep
# Keep the z-scored mortgage; back up and drop the raw and logged versions.
mortgage_loan=Loan['Mortgage']
Loan.drop(['Mortgage'], axis=1, inplace=True)#backup retained in mortgage_loan above
Loan.drop(['Mortgage_log'], axis=1, inplace=True)
# Separate the target from the features and hold out 30% for testing.
y=Loan['Personal_Loan']
X=Loan.drop(['Personal_Loan'],axis=1,inplace=False)
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1) # fixed seed for reproducibility
# Ordinary-least-squares baseline: fit on the training split and tabulate the
# coefficients plus the intercept, indexed by feature name.
# Fix: removed a no-op warnings demo (fxn() raising DeprecationWarning inside
# warnings.catch_warnings) -- it suppressed nothing outside its own `with`
# block, and warnings are already globally ignored at the top of the notebook.
from sklearn.linear_model import LinearRegression
lin_reg_model = LinearRegression()
lin_reg_model.fit(x_train, y_train)
coef_df = pd.DataFrame(
    np.append(lin_reg_model.coef_, lin_reg_model.intercept_),
    index=x_train.columns.tolist() + ["Intercept"],
    columns=["Coefficients"],
)
coef_df
Age: as people age, they seem slightly less likely to accept the loan; the high correlation with experience may nullify this. Experience: as people become more experienced, they seem slightly more likely to accept the loan; the high correlation with age may nullify this. Family (note I didn't one-hot encode this because 1) the assignment of the number is meaningful and 2) we may see other family sizes in the future): people with larger families seem slightly more likely to accept the loan. Education (same one-hot reasoning as Family): more educated people may be more likely to accept the loan. Securities account: those without one appear more likely to accept. CD account: high correlation; those with one are more likely to accept. Online: those that do not use it appear more likely to accept, contradicting the earlier assumption. Credit card: those that don't use another bank's card appear more likely to accept, contradicting the earlier assumption. Logged features (these correlations are likely more meaningful than the above implies) -- Income: higher income means more likely to accept; likely one of our best indicators. CCAvg: higher spending means more likely to accept. Z-scored -- Mortgage: those with higher mortgages are more likely to accept
# Class-balance report: acceptance counts and rates in the full data and in
# each split, verifying the 70/30 split preserved the positive-class rate.
print("Original Accepted True Values : {0} ({1:0.2f}%)".format(len(Loan.loc[Loan['Personal_Loan'] == 1]), (len(Loan.loc[Loan['Personal_Loan'] == 1])/len(Loan.index)) * 100))
print("Original Accepted False Values : {0} ({1:0.2f}%)".format(len(Loan.loc[Loan['Personal_Loan'] == 0]), (len(Loan.loc[Loan['Personal_Loan'] == 0])/len(Loan.index)) * 100))
print("")
print("Training Success True Values : {0} ({1:0.2f}%)".format(len(y_train[y_train[:] == 1]), (len(y_train[y_train[:] == 1])/len(y_train)) * 100))
print("Training Success False Values : {0} ({1:0.2f}%)".format(len(y_train[y_train[:] == 0]), (len(y_train[y_train[:] == 0])/len(y_train)) * 100))
print("")
print("Test Success True Values : {0} ({1:0.2f}%)".format(len(y_test[y_test[:] == 1]), (len(y_test[y_test[:] == 1])/len(y_test)) * 100))
print("Test Success False Values : {0} ({1:0.2f}%)".format(len(y_test[y_test[:] == 0]), (len(y_test[y_test[:] == 0])/len(y_test)) * 100))
print("")
Split well
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
def adj_r2_score(predictors, targets, predictions):
    """Adjusted R-squared: penalises R-squared for the number of predictors.

    predictors  : feature matrix (only its shape is used: n rows, k columns)
    targets     : true values
    predictions : model outputs
    """
    n, k = predictors.shape[0], predictors.shape[1]
    raw_r2 = r2_score(targets, predictions)
    return 1 - ((1 - raw_r2) * (n - 1) / (n - k - 1))
# function to compute MAPE
def mape_score(targets, predictions):
    """Mean absolute percentage error, in percent.

    NOTE(review): divides by `targets`, so any zero target yields inf/nan --
    fine for strictly positive regression targets, meaningless for 0/1 labels.
    """
    relative_errors = np.abs(targets - predictions) / targets
    return np.mean(relative_errors) * 100
# function to compute different metrics to check performance of a regression model
def model_performance_regression(model, predictors, target):
    """Return a one-row DataFrame of regression metrics for `model`.

    model      : fitted regressor exposing .predict
    predictors : independent variables
    target     : dependent variable
    """
    # predict from the independent variables once, then score every metric
    pred = model.predict(predictors)
    metrics_row = {
        "RMSE": np.sqrt(mean_squared_error(target, pred)),
        "MAE": mean_absolute_error(target, pred),
        "R-squared": r2_score(target, pred),
        "Adj. R-squared": adj_r2_score(predictors, target, pred),
        "MAPE": mape_score(target, pred),
    }
    return pd.DataFrame(metrics_row, index=[0])
# Evaluate the linear-regression baseline on the training split.
print("Training Performance\n")
lin_reg_model_train_perf = model_performance_regression(lin_reg_model, x_train, y_train)
lin_reg_model_train_perf
These numbers aren't good. Lets look at confusion mat
# Logistic regression on the same split: fit, print coefficients, and draw a
# test-set confusion matrix (positives first in both axes).
# Fixes: removed (a) a second copy of the no-op warnings demo and (b) a
# redundant refit of the already-fitted linear-regression baseline -- neither
# changed any observable result.
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
# Fit the model on train
model = LogisticRegression(solver="liblinear", random_state=1)
model.fit(x_train, y_train)
# predict on test
y_predict = model.predict(x_test)
coef_df = pd.DataFrame(model.coef_)
coef_df['intercept'] = model.intercept_
print(coef_df)
# Rows are actual class, columns are predicted class, class 1 listed first.
cm=metrics.confusion_matrix(y_test, y_predict, labels=[1, 0])
df_cm = pd.DataFrame(cm, index = [i for i in ["Actual 1"," Actual 0"]],
                     columns = [i for i in ["Predict 1","Predict 0"]])
plt.figure(figsize = (7,5))
sns.heatmap(df_cm, annot=True,fmt='g')
plt.show()
# Overall test accuracy of the logistic model.
model_score = model.score(x_test, y_test)
print(model_score)
Not great: we can predict easily when they won't accept, with a specificity of 1334/(1334+17)=0.987, but sensitivity is only 73/(73+76)=0.490. Insights from this portion: this isn't great, but income, mortgage, CCAvg, and CC accounts all seem important
Let's do random tree
# Baseline decision tree: Gini impurity, no depth limit.
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
dTree1 = DecisionTreeClassifier(criterion='gini', random_state=1)
dTree1.fit(x_train, y_train)
from sklearn import metrics


def get_recall_score(model):
    """Print recall of `model` on the global train and test splits.

    model : classifier to predict values of X
    """
    pred_train = model.predict(x_train)
    pred_test = model.predict(x_test)
    print("Recall on training set : ", metrics.recall_score(y_train, pred_train))
    print("Recall on test set : ", metrics.recall_score(y_test, pred_test))


# Accuracy on train and test
print("Accuracy on training set : ", dTree1.score(x_train, y_train))
print("Accuracy on test set : ", dTree1.score(x_test, y_test))
# Recall on train and test
get_recall_score(dTree1)
Honestly, not bad. Let's vary max depth to see if it helps
# Depth sweep: refit the Gini tree with max_depth = 1..6 and report accuracy
# and recall at each depth.
# Fix: the original was six copy-pasted cells, each also redefining an
# identical get_recall_score(); collapsed into one helper plus one loop.
# Printed values and their order are unchanged, and dTree1 still ends as the
# depth-6 tree, exactly as before.
def get_recall_score(model):
    """Print recall of `model` on the global train and test splits.

    model : classifier to predict values of X
    """
    pred_train = model.predict(x_train)
    pred_test = model.predict(x_test)
    print("Recall on training set : ", metrics.recall_score(y_train, pred_train))
    print("Recall on test set : ", metrics.recall_score(y_test, pred_test))


for depth in range(1, 7):
    dTree1 = DecisionTreeClassifier(criterion='gini', max_depth=depth, random_state=1)
    dTree1.fit(x_train, y_train)
    # Accuracy on train and test
    print("Accuracy on training set : ", dTree1.score(x_train, y_train))
    print("Accuracy on test set : ", dTree1.score(x_test, y_test))
    # Recall on train and test
    get_recall_score(dTree1)
Getting better!
# One more depth step (max_depth=7) to confirm performance peaks at depth 6.
# Fix: dropped yet another byte-identical redefinition of get_recall_score();
# the helper already defined earlier in the notebook is reused.
dTree1 = DecisionTreeClassifier(criterion='gini', max_depth=7, random_state=1)
dTree1.fit(x_train, y_train)
# Accuracy on train and test
print("Accuracy on training set : ", dTree1.score(x_train, y_train))
print("Accuracy on test set : ", dTree1.score(x_test, y_test))
# Recall on train and test
get_recall_score(dTree1)
Recall and accuracy both peak at depth = 6. Let's change min leaf size
# Leaf-size sweep at the chosen depth: max_depth=6, min_samples_leaf = 1..5.
# Fix: five copy-pasted cells (each redefining get_recall_score identically)
# collapsed into one loop; printed values and their order are unchanged, and
# dTree1 still ends as the depth-6 / leaf-5 tree, exactly as before.
for leaf_size in range(1, 6):
    dTree1 = DecisionTreeClassifier(criterion='gini', max_depth=6,
                                    min_samples_leaf=leaf_size, random_state=1)
    dTree1.fit(x_train, y_train)
    # Accuracy on train and test
    print("Accuracy on training set : ", dTree1.score(x_train, y_train))
    print("Accuracy on test set : ", dTree1.score(x_test, y_test))
    # Recall on train and test
    get_recall_score(dTree1)
A max depth of 6 and min leaf size of 5 are ideal; let's do a grid search to make sure we didn't miss anything
from sklearn.model_selection import GridSearchCV
# Exhaustive hyper-parameter search over the tree, scored by recall, 3-fold CV.
estimator = DecisionTreeClassifier(random_state=1)
# Grid of parameters to choose from
parameters = {'max_depth': [1,2,3,4,5,6,7,8,9,10, None],
              'criterion': ['gini','entropy'],
              'splitter': ['best','random'],
              'min_samples_leaf':[1,2,3,4,5,6,7,8,9,10]
             }
# Fix: removed an unused scorer built via metrics.make_scorer -- the search
# below already selects on the string scoring='recall'.
# Run the grid search
grid_obj = GridSearchCV(estimator, parameters, scoring='recall', cv=3)
grid_obj = grid_obj.fit(x_train, y_train)
# Best hyper-parameter combination found by the search.
estimator = grid_obj.best_estimator_
# Refit the winning configuration on the full training split.
estimator.fit(x_train, y_train)
Check it's score?
# Compare the grid-search winner against the hand-tuned tree.
# Accuracy on train and test
print("Accuracy on training set : ",estimator.score(x_train, y_train))
print("Accuracy on test set : ",estimator.score(x_test, y_test))
# Recall on train and test
get_recall_score(estimator)
Ok so it did well with training but not as well with test, I will stick with my original number
# Final model: the hand-tuned tree (depth 6, min leaf 5), refit for clarity.
# Fix: dropped the last duplicated get_recall_score definition; the helper
# already defined earlier in the notebook is reused.
dTree1 = DecisionTreeClassifier(criterion='gini', max_depth=6, min_samples_leaf=5, random_state=1)
dTree1.fit(x_train, y_train)
# Accuracy on train and test
print("Accuracy on training set : ", dTree1.score(x_train, y_train))
print("Accuracy on test set : ", dTree1.score(x_test, y_test))
# Recall on train and test
get_recall_score(dTree1)
Importance matrix
# Feature importances of the final tree, largest first.
print (pd.DataFrame(dTree1.feature_importances_, columns = ["Imp"], index = x_train.columns).sort_values(by = 'Imp', ascending = False))
Education and income are most important; family and CCAvg are also at least somewhat important. Now, let's do some post-pruning
# Cost-complexity pruning path: candidate ccp_alpha values and the total leaf
# impurity of the tree pruned at each alpha.
# Fix: the path was computed twice back-to-back with identical inputs; the
# duplicate fit/computation is removed (the results were identical).
clf = DecisionTreeClassifier(random_state=1)
path = clf.cost_complexity_pruning_path(x_train, y_train)
ccp_alphas, impurities = path.ccp_alphas, path.impurities
pd.DataFrame(path)
For obvious reason, impurities increase with ccp alpha
# Impurity rises monotonically as alpha prunes more aggressively; the final
# (root-only) point is omitted for readability.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(ccp_alphas[:-1], impurities[:-1], marker='o', drawstyle="steps-post")
ax.set(xlabel="effective alpha",
       ylabel="total impurity of leaves",
       title="Total Impurity vs effective alpha for training set")
plt.show()
# Fit one tree per candidate alpha; the largest alpha collapses the tree to a
# single node, so it is dropped from the comparison set afterwards.
clfs = []
for ccp_alpha in ccp_alphas:
    clf = DecisionTreeClassifier(random_state=1, ccp_alpha=ccp_alpha)
    clf.fit(x_train, y_train)
    clfs.append(clf)
print("Number of nodes in the last tree is: {} with ccp_alpha: {}".format(
    clfs[-1].tree_.node_count, ccp_alphas[-1]))
clfs = clfs[:-1]
ccp_alphas = ccp_alphas[:-1]
# Size and depth of each remaining pruned tree.
node_counts = [clf.tree_.node_count for clf in clfs]
depth = [clf.tree_.max_depth for clf in clfs]
# Two stacked panels: tree size (node count) and tree depth, both vs alpha.
fig, ax = plt.subplots(2, 1, figsize=(10, 7))
ax[0].plot(ccp_alphas, node_counts, marker='o', drawstyle="steps-post")
ax[0].set(xlabel="alpha", ylabel="number of nodes", title="Number of nodes vs alpha")
ax[1].plot(ccp_alphas, depth, marker='o', drawstyle="steps-post")
ax[1].set(xlabel="alpha", ylabel="depth of tree", title="Depth vs alpha")
fig.tight_layout()
# Train/test accuracy across the pruning path; the gap between the curves
# narrows as alpha trades training fit for generalisation.
train_scores = [clf.score(x_train, y_train) for clf in clfs]
test_scores = [clf.score(x_test, y_test) for clf in clfs]
fig, ax = plt.subplots(figsize=(10, 5))
ax.set(xlabel="alpha", ylabel="accuracy",
       title="Accuracy vs alpha for training and testing sets")
ax.plot(ccp_alphas, train_scores, marker='o', label="train", drawstyle="steps-post")
ax.plot(ccp_alphas, test_scores, marker='o', label="test", drawstyle="steps-post")
ax.legend()
plt.show()
# Select the pruned tree with the best TEST accuracy and report both scores.
index_best_model = np.argmax(test_scores)
best_model = clfs[index_best_model]
print(best_model)
print('Training accuracy of best model: ', best_model.score(x_train, y_train))
print('Test accuracy of best model: ', best_model.score(x_test, y_test))
This makes sense, about what we got before. Recall is important here too: as we don't have pay-back rates, we will want to look at who WILL accept the loan
# Recall across the pruning path, then select the tree with the best TEST
# recall (recall matters most here: we want to find everyone who WOULD accept).
# Fix: the original computed recall_test twice with two byte-identical loops;
# the duplicate is removed and both sweeps are written as comprehensions.
recall_train = [metrics.recall_score(y_train, clf.predict(x_train)) for clf in clfs]
recall_test = [metrics.recall_score(y_test, clf.predict(x_test)) for clf in clfs]
# creating the model where we get highest train and test recall
index_best_model = np.argmax(recall_test)
best_model = clfs[index_best_model]
print(best_model)
Confusion matrix time
def make_confusion_matrix(model, y_actual, labels=[1, 0]):
    """Plot an annotated confusion-matrix heatmap for `model` on the global x_test.

    model    : classifier to predict values of X
    y_actual : ground truth

    NOTE(review): the `labels` parameter is never used -- the matrix is built
    with a hard-coded [0, 1] order and the name is rebound to the annotation
    array below; confirm the intended ordering before relying on the parameter.
    """
    y_predict = model.predict(x_test)
    cm = metrics.confusion_matrix(y_actual, y_predict, labels=[0, 1])
    df_cm = pd.DataFrame(
        cm,
        index=[i for i in ["Actual - No", "Actual - Yes"]],
        columns=[i for i in ['Predicted - No', 'Predicted - Yes']],
    )
    # Annotate each cell with its raw count and its share of all predictions.
    group_counts = ["{0:0.0f}".format(value) for value in cm.flatten()]
    group_percentages = ["{0:.2%}".format(value) for value in cm.flatten() / np.sum(cm)]
    labels = np.asarray(
        [f"{v1}\n{v2}" for v1, v2 in zip(group_counts, group_percentages)]
    ).reshape(2, 2)
    plt.figure(figsize=(10, 7))
    sns.heatmap(df_cm, annot=labels, fmt='')
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
make_confusion_matrix(best_model, y_test)
Specificity = 1341/1351 = 0.993, a little better than linear regression. Sensitivity = 134/149 = 0.899, much better than linear regression. I would HIGHLY recommend using a decision tree with depth 6 and min leaf size 5 for future studies. Let's look at the importance matrix again
# Importance matrix of the final hand-tuned tree, repeated for the write-up.
print (pd.DataFrame(dTree1.feature_importances_, columns = ["Imp"], index = x_train.columns).sort_values(by = 'Imp', ascending = False))
Insights. 1: use a decision tree with depth 6 and min leaf size 5. 2: focus on high-income individuals (from the previous part we saw this was a positive relationship). 3: focus on people with above an undergrad education, as again we saw a positive relationship. 4: focus on families of 3+ (likely having at least one kid or other dependent), as we saw larger families had a positive relationship with acceptance. 5: do not focus too much on age or experience as long as it doesn't impact the previous points. 6: area (ZIP) is mostly meaningless, hence why it was removed. 7: we don't care if they use the online portal or if they have another bank's card. 8: DO NOT USE TO PREDICT WHO WILL PAY OFF LOANS